3 General statistics

3.1 Sample statistics

3.1.1 Number of specimens

# Count the distinct specimen_id values in the sample table
read_tsv("data/sample.tsv") %>%
  distinct(specimen_id) %>%
  nrow()
[1] 4364

3.1.2 Number of species

# Count the distinct specimen_species values in the sample table
read_tsv("data/sample.tsv") %>%
  distinct(specimen_species) %>%
  nrow()
[1] 244

3.1.3 Number of orders

# Count the distinct specimen_order values in the sample table
read_tsv("data/sample.tsv") %>%
  distinct(specimen_order) %>%
  nrow()
[1] 22

3.1.4 Origin of samples (Figure S1)

# World map of sample capture locations, coloured by taxonomic order.
# The world polygon table is built once and reused, because geom_map() takes
# it both as the layer data and as the `map` lookup.
world_map <- map_data("world")

read_tsv("data/sample.tsv") %>%
  # Keep only the identifier, taxonomy and coordinate columns
  select(
    sample_id,
    specimen_species,
    specimen_order,
    specimen_class,
    capture_latitude,
    capture_longitude
  ) %>%
  # Add Gaussian noise (mean 0, sd 0.5) to each coordinate; presumably so that
  # samples sharing a capture location do not overplot.
  # NOTE(review): no seed is set, so point positions change on every render.
  mutate(
    capture_latitude_jitter =
      capture_latitude + rnorm(n(), mean = 0, sd = 0.5),
    capture_longitude_jitter =
      capture_longitude + rnorm(n(), mean = 0, sd = 0.5)
  ) %>%
  # Plot map
  ggplot() +
  geom_map(
    data = world_map,
    map = world_map,
    aes(long, lat, map_id = region),
    # `linewidth` replaces `size` for outline width (ggplot2 >= 3.4.0)
    color = "white", fill = "#cccccc", linewidth = 0.2
  ) +
  geom_point(
    aes(
      x = capture_longitude_jitter,
      y = capture_latitude_jitter,
      color = specimen_order
    ),
    alpha = 0.5, size = 0.5, shape = 16
  ) +
  labs(color = "Taxonomic order") +
  theme_minimal() +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    legend.position = "bottom"
  )

3.2 Data statistics

3.2.1 Total data

# Summarise bases_pre_fastp across all rows, expressed in gigabases (Gb):
# overall total, mean and standard deviation, with missing values ignored
read_tsv("data/preprocessing.tsv") %>%
  mutate(gigabases = bases_pre_fastp / 1e9) %>%
  summarise(
    total = sum(gigabases, na.rm = TRUE),
    mean = mean(gigabases, na.rm = TRUE),
    sd = sd(gigabases, na.rm = TRUE)
  ) %>%
  tt()
tinytable_qnsrs1cvs4it1a6qr9t7
total mean sd
14697.42 5.860216 5.044053

3.2.2 Quality-filtered data

# Summarise bases_post_fastp across all rows, expressed in gigabases (Gb):
# overall total, mean and standard deviation, with missing values ignored
read_tsv("data/preprocessing.tsv") %>%
  mutate(gigabases = bases_post_fastp / 1e9) %>%
  summarise(
    total = sum(gigabases, na.rm = TRUE),
    mean = mean(gigabases, na.rm = TRUE),
    sd = sd(gigabases, na.rm = TRUE)
  ) %>%
  tt()
tinytable_d8yr4xjf0ohrpfx4pwrl
total mean sd
13486.81 5.377518 4.586315

3.2.3 Host genomic data

# Summarise host_bases across all rows, expressed in gigabases (Gb):
# overall total, mean and standard deviation, with missing values ignored
read_tsv("data/preprocessing.tsv") %>%
  mutate(gigabases = host_bases / 1e9) %>%
  summarise(
    total = sum(gigabases, na.rm = TRUE),
    mean = mean(gigabases, na.rm = TRUE),
    sd = sd(gigabases, na.rm = TRUE)
  ) %>%
  tt()
tinytable_qv66yk77kikwlw90cypa
total mean sd
5554.962 2.193903 3.710612

3.2.4 Metagenomic data

# Summarise metagenomic_bases across all rows, expressed in gigabases (Gb):
# overall total, mean and standard deviation, with missing values ignored
read_tsv("data/preprocessing.tsv") %>%
  mutate(gigabases = metagenomic_bases / 1e9) %>%
  summarise(
    total = sum(gigabases, na.rm = TRUE),
    mean = mean(gigabases, na.rm = TRUE),
    sd = sd(gigabases, na.rm = TRUE)
  ) %>%
  tt()
tinytable_kn3dqftzgys4ke5na33v
total mean sd
7931.853 3.132643 3.272361

3.2.5 Assemblies

# Tally rows of the assembly table per assembly_type; rows with a missing
# type are kept and reported as their own NA group
read_tsv("data/assembly.tsv") %>%
  count(assembly_type, name = "assembly_n") %>%
  tt()
tinytable_12ohfw4zstpyx6bxi5ne
assembly_type assembly_n
Coassembly 2228
Individual 1722
Multisplit 7
NA 1

3.2.6 MAGs

# Number of rows in the MAG table (presumably one row per MAG -- confirm)
nrow(read_tsv("data/mag.tsv"))
[1] 49957